[FRAUD] Turning the Book's Code into Functions

graph
Author

김보람

Published

April 4, 2024

FRAUD code

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')

# sklearn
from sklearn import model_selection # train/test split
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
def down_sample_textbook(df):
    # Balance the classes by downsampling non-fraud rows to the number of fraud rows.
    df_majority = df[df.is_fraud==0].copy()
    df_minority = df[df.is_fraud==1].copy()
    df_maj_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled
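For reference, a minimal usage sketch on a toy frame (the column names follow the fraudTrain schema; the row counts are purely illustrative):

toy = pd.DataFrame({'amt': range(8), 'is_fraud': [0]*6 + [1]*2})
balanced = down_sample_textbook(toy)
balanced.is_fraud.value_counts()  # 2 fraud rows kept, 2 non-fraud rows sampled -> 50/50 balance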

def compute_time_difference(group):
    # Absolute pairwise time gaps (in nanoseconds) between all transactions in the group.
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
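The double loop does O(n²) pandas row lookups; assuming trans_date_trans_time is a datetime64[ns] column, an equivalent vectorized sketch (hypothetical helper name) produces the same (name_i, name_j, |Δt| in ns) triples:

def compute_time_difference_fast(group):
    t = group.trans_date_trans_time.values.astype('datetime64[ns]').astype('int64')  # ns since epoch
    names = group.index.to_numpy()
    diff = np.abs(t[:, None] - t[None, :])  # full pairwise |t_i - t_j| matrix
    ii, jj = np.meshgrid(np.arange(len(t)), np.arange(len(t)), indexing='ij')
    return list(zip(names[ii.ravel()], names[jj.ravel()], diff.ravel()))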

def mask(df):
    # Boolean train/test masks over positions 0..N-1 (assumes df has a default RangeIndex).
    df_tr,df_test = sklearn.model_selection.train_test_split(df, random_state=42)
    N = len(df)
    train_mask = [i in df_tr.index for i in range(N)]
    test_mask = [i in df_test.index for i in range(N)]
    train_mask = np.array(train_mask)
    test_mask = np.array(test_mask)
    return train_mask, test_mask
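Under that same RangeIndex assumption, the per-element membership test can be written more compactly with np.isin (a hypothetical variant, not from the book):

def mask_fast(df):
    df_tr, df_test = sklearn.model_selection.train_test_split(df, random_state=42)
    idx = np.arange(len(df))
    return np.isin(idx, df_tr.index), np.isin(idx, df_test.index)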

def edge_index_selected(edge_index):
    # edge_index: array of [node_i, node_j, time_gap] rows; note the third column is mutated in place.
    theta = edge_index[:,2].mean()
    # Map time gaps to similarity weights exp(-gap/theta), zeroing out exact-zero gaps (where exp == 1).
    edge_index[:,2] = (np.exp(-edge_index[:,2]/theta) != 1)*(np.exp(-edge_index[:,2]/theta))
    edge_index = edge_index.tolist()
    mean_ = np.array(edge_index)[:,2].mean()
    # Keep only edges whose transformed weight exceeds the mean transformed weight.
    selected_edges = [(int(row[0]), int(row[1])) for row in edge_index if row[2] > mean_]
    edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
    return edge_index_selected
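A quick trace on toy values shows what survives the filter: θ is the mean raw gap, gaps are mapped through exp(-gap/θ) with exact-zero gaps masked to 0, and only edges whose transformed weight beats the mean transformed weight are kept. Illustrative numbers only:

toy = np.array([[0., 1.,  0.],   # zero gap -> weight masked to 0
                [0., 2., 10.],   # exp(-10/13.3) ≈ 0.47, above the mean ≈ 0.19 -> kept
                [1., 2., 30.]])  # exp(-30/13.3) ≈ 0.11, below the mean -> dropped
edge_index_selected(toy)  # tensor([[0], [2]]): only the (0, 2) edge survives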

with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    

fraudTrain
fraudTrain: 1048575 rows × 22 columns (trans_date_trans_time, cc_num, merchant, category, amt, first, last, gender, street, city, ..., lat, long, city_pop, job, dob, trans_num, unix_time, merch_lat, merch_long, is_fraud); the printed head and tail span 2019-01-01 through 2020-03-10, and every previewed row has is_fraud == 0.

%run ../function_proposed_gcn.py
%run ../functions-book.py

Data preparation

df50 = throw(fraudTrain, 0.5)
train_mask, test_mask = mask(df50)

Book (graph analysis of credit card transactions)

- Bipartite graph

def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
    df["from"]=df["cc_num"].apply(lambda x:mapping[x])  #엣지의 출발점
    df["to"]=df["merchant"].apply(lambda x:mapping[x])  #엣지의 도착점
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
    df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
    
    G=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G, {(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  #엣지 속성 설정,각 속성의 사기 여부부 
    
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # 엣지 속성 설정, 각 엣지의 거래 금액

    return G
G_bu = build_graph_bipartite(df50, nx.Graph(name="Bipartite Undirect"))
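Since nx.Graph() does not enforce bipartiteness by itself, a quick sanity check on G_bu can be worthwhile (a sketch using the attributes set by the builder above):

labels = nx.get_edge_attributes(G_bu, "label")
print(G_bu.number_of_nodes(), G_bu.number_of_edges())  # card + merchant nodes, unique (card, merchant) pairs
print(sum(labels.values()) / len(labels))              # share of edges with at least one fraudulent transaction
print(nx.is_bipartite(G_bu))                           # expected True: edges only connect cards to merchants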

- Tripartite graph

def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}
    df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"]=df["merchant"].apply(lambda x:mapping[x])
    
        
    G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
     
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
    
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
    
    
    return G
    
  • Assign one node to each merchant, each customer, and each transaction
G_tu = build_graph_tripartite(df50, nx.Graph())
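Because every transaction row adds one transaction node plus an edge to its card and an edge to its merchant, the counts should reconcile as below (a sketch, assuming no id collisions between index values, card numbers, and merchant names):

n_tx, n_cards, n_merch = len(df50), df50.cc_num.nunique(), df50.merchant.nunique()
print(G_tu.number_of_nodes(), n_tx + n_cards + n_merch)  # expected to match
print(G_tu.number_of_edges(), 2 * n_tx)                  # two edges per transaction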

Supervised and unsupervised embeddings for fraud detection

Supervised learning

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    """
    Build a bipartite graph from the input dataframe.

    Parameters:
        df_input (DataFrame): Input dataframe containing transaction information.
        graph_type (networkx graph type, optional): Type of graph to create. Defaults to nx.Graph().

    Returns:
        networkx.Graph: Bipartite graph.
    """
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() + df["merchant"].values.tolist()))}
    
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])  # 엣지의 출발점
    df["to"] = df["merchant"].apply(lambda x: mapping[x])  # 엣지의 도착점
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from', 'to']).agg({"is_fraud":"sum", "amt":"sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)
    
    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"] for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")  # 엣지 속성 설정, 각 속성의 사기 여부
    
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"] for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")  # 엣지 속성 설정, 각 엣지의 거래 금액

    return G

def train_and_evaluate_node2vec(df, embedding_dimension=128, test_size=0.2, random_state=42):
    """
    Train and evaluate node2vec embeddings with a Random Forest classifier.

    Parameters:
        df (DataFrame): Input dataframe containing transaction information.
        embedding_dimension (int, optional): Dimension of node embeddings. Defaults to 128.
        test_size (float, optional): Proportion of the dataset to include in the test split. Defaults to 0.2.
        random_state (int, optional): Seed used by the random number generator. Defaults to 42.

    Returns:
        dict: Dictionary containing evaluation metrics.
    """
    G = build_graph_bipartite(df)
    
    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G.edges))), 
                                                                 list(nx.get_edge_attributes(G, "label").values()),
                                                                 test_size=test_size,
                                                                 random_state=random_state)
    
    edgs = list(G.edges)
    train_graph = G.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, dimensions=embedding_dimension, weight_key='weight')
    model_train = node2vec_train.fit(window=10)
    
    classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
    evaluation_results = {}
    
    for cl in classes:
        embeddings_train = cl(keyed_vectors=model_train.wv)

        train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
        test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

        rf = RandomForestClassifier(n_estimators=1000, random_state=random_state)
        rf.fit(train_embeddings, train_labels)

        yhat = rf.predict(test_embeddings)
        acc = metrics.accuracy_score(y, yhat)
        pre = metrics.precision_score(y, yhat)
        rec = metrics.recall_score(y, yhat)
        f1 = metrics.f1_score(y, yhat)
        auc = metrics.roc_auc_score(y, yhat)
        
        evaluation_results[cl.__name__] = {"accuracy": acc, "precision": pre, "recall": rec, "f1-score": f1, "auc": auc}

    return evaluation_results

# Example usage:
# evaluation_results = train_and_evaluate_node2vec(df50)
# print(evaluation_results)
evaluation_results = train_and_evaluate_node2vec(df50)
print(evaluation_results)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00,  2.55it/s]
{'HadamardEmbedder': {'accuracy': 0.5240514905149052, 'precision': 0.7365591397849462, 'recall': 0.09176155391828533, 'f1-score': 0.16319237641453246, 'auc': 0.5290884534498898}, 'AverageEmbedder': {'accuracy': 0.7130758807588076, 'precision': 0.7094682230869002, 'recall': 0.7327528466175486, 'f1-score': 0.7209225700164745, 'auc': 0.7128466083670335}, 'WeightedL1Embedder': {'accuracy': 0.49390243902439024, 'precision': 0.4857142857142857, 'recall': 0.011386470194239785, 'f1-score': 0.02225130890052356, 'auc': 0.4995246264610679}, 'WeightedL2Embedder': {'accuracy': 0.49762872628726285, 'precision': 0.6190476190476191, 'recall': 0.017414601473543203, 'f1-score': 0.033876221498371335, 'auc': 0.5032240930602808}}
Only AverageEmbedder gives a usable signal here (accuracy, F1, and AUC all near 0.71); HadamardEmbedder is precise but recalls under 10% of fraud edges, and the two weighted-L embedders sit at chance level.
train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_bu.edges))), 
                                                             list(nx.get_edge_attributes(G_bu, "label").values()))
np.array(train_labels).shape
(8854,)
np.array(y).shape
(2952,)
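With the default 75/25 split, the 11,806 labeled edges of G_bu fall into 8,854 training and 2,952 test edges, matching the shapes above.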
yhat.mean()
0.0
def try_book(fraudTrain, fraudrate, n, prev_results=None):
    if prev_results is None:
        df_results = pd.DataFrame(columns=[
            'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based', 
            'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate', 
            'test_size', 'test_frate', 'hyper_params'
        ])
    else:
        df_results = prev_results
    
    dfrate = throw(fraudTrain, fraudrate)
    df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)
        
    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)]
    dfnn = dfnn.reset_index(drop=True)
    df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)
   
    df2, mask = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()

    G_df = build_graph_tripartite(df, nx.Graph())

    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))), 
                                                                 list(nx.get_edge_attributes(G_df, "label").values()), 
                                                                 test_size=0.20, 
                                                                 random_state=42)

    edgs = list(G_df.edges)
    train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    
    embeddings_train = HadamardEmbedder(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    yhat = rf.predict(test_embeddings)
    acc = metrics.accuracy_score(y, yhat)
    pre = metrics.precision_score(y, yhat)
    rec = metrics.recall_score(y, yhat)
    f1 = metrics.f1_score(y, yhat)
    auc = metrics.roc_auc_score(y, yhat)

    
    result = {
        'model': 'tripartite',
        'time': None,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'graph_based': True,
        'method': "Hadaembedder",
        'throw_rate': df.is_fraud.mean(),
        'train_size': len(train_labels),
        'train_cols': 'amt',
        'train_frate': np.array(train_labels).mean(),
        'test_size': len(y),
        'test_frate': np.array(y).mean(),
        'hyper_params': None,
        'theta': None,
        'gamma': None
    }

    df_results = pd.concat([df_results, pd.DataFrame([result])], ignore_index=True)

    ymdhms = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'../results/{ymdhms}-pyod.csv', index=False)
    return df_results
try_book(fraudTrain, 0.5, 10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [01:29<00:00,  8.93s/it]
model time acc pre rec f1 auc graph_based method throw_rate train_size train_cols train_frate test_size test_frate hyper_params
dfrate = throw(fraudTrain, 0.5)
df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)
dfn = fraudTrain[::10]
dfnn = dfn[~dfn.index.isin(df_tr.index)]
dfnn = dfnn.reset_index(drop=True)
df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)
df2, mask = concat(df_tr, df_tstn)
df2['index'] = df2.index
df = df2.reset_index()

G_df = build_graph_tripartite(df, nx.Graph())
train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))), 
                                                             list(nx.get_edge_attributes(G_df, "label").values()), 
                                                             test_size=0.20, 
                                                             random_state=42)
edgs = list(G_df.edges)
train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [01:30<00:00,  9.05s/it]
embeddings_train = AverageEmbedder(keyed_vectors=model_train.wv)
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(train_embeddings, train_labels)
RandomForestClassifier(n_estimators=1000, random_state=42)
yhat = rf.predict(test_embeddings)
acc = metrics.accuracy_score(y, yhat)
pre = metrics.precision_score(y, yhat)
rec = metrics.recall_score(y, yhat)
f1 = metrics.f1_score(y, yhat)
auc = metrics.roc_auc_score(y, yhat)
yhat.sum()
0
acc
0.8716776221777651
pre
0.0
rec
0.0
f1
0.0
auc
0.5
The classifier predicts no fraud at all (yhat.sum() == 0), so precision, recall, and F1 collapse to 0 and AUC to 0.5; the 0.87 accuracy merely mirrors the majority-class share of the test edges.
result = {
        'model': 'tripartite',
        'time': None,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'graph_based': True,
        'method': "Hadaembedder",
        'throw_rate': df.is_fraud.mean(),
        'train_size': len(train_labels),
        'train_cols': 'amt',
        'train_frate': np.array(train_labels).mean(),
        'test_size': len(y),
        'test_frate': np.array(y).mean(),
        'hyper_params': None,
        'theta': None,
        'gamma': None
    }
try_book(fraudTrain, 0.7, 10)
def try_book_A(fraudTrain, fraudrate, n, prev_results=None):
    if prev_results is None:
        df_results = pd.DataFrame(columns=[
            'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based', 
            'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate', 
            'test_size', 'test_frate', 'hyper_params'
        ])
    else:
        df_results = prev_results
    
    dfrate = throw(fraudTrain, fraudrate)
    df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)
        
    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)]
    dfnn = dfnn.reset_index(drop=True)
    df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)
   
    df2, mask = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()

    G_df = build_graph_tripartite(df, nx.Graph())

    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))), 
                                                                 list(nx.get_edge_attributes(G_df, "label").values()), 
                                                                 test_size=0.20, 
                                                                 random_state=42)

    edgs = list(G_df.edges)
    train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    
    embeddings_train = AverageEmbedder(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    yhat = rf.predict(test_embeddings)
    acc = metrics.accuracy_score(y, yhat)
    pre = metrics.precision_score(y, yhat)
    rec = metrics.recall_score(y, yhat)
    f1 = metrics.f1_score(y, yhat)
    auc = metrics.roc_auc_score(y, yhat)

    
    result = {
        'model': 'tripartite',
        'time': None,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'graph_based': True,
        'method': 'AverageEmbedder',
        'throw_rate': df.is_fraud.mean(),
        'train_size': len(train_labels),
        'train_cols': 'amt',
        'train_frate': np.array(train_labels).mean(),
        'test_size': len(y),
        'test_frate': np.array(y).mean(),
        'hyper_params': None,
        'theta': None,
        'gamma': None
    }

    
    df_results = pd.concat([df_results, pd.DataFrame([result])], ignore_index=True)
    
    return df_results
def try_book_W1(fraudTrain, fraudrate, n, prev_results=None):
    if prev_results is None:
        df_results = pd.DataFrame(columns=[
            'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based', 
            'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate', 
            'test_size', 'test_frate', 'hyper_params'
        ])
    else:
        df_results = prev_results
    
    dfrate = throw(fraudTrain, fraudrate)
    df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)
        
    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)]
    dfnn = dfnn.reset_index(drop=True)
    df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)
   
    df2, mask = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()

    G_df = build_graph_tripartite(df, nx.Graph())

    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))), 
                                                                 list(nx.get_edge_attributes(G_df, "label").values()), 
                                                                 test_size=0.20, 
                                                                 random_state=42)

    edgs = list(G_df.edges)
    train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    
    embeddings_train = WeightedL1Embedder(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    yhat = rf.predict(test_embeddings)
    acc = metrics.accuracy_score(y, yhat)
    pre = metrics.precision_score(y, yhat)
    rec = metrics.recall_score(y, yhat)
    f1 = metrics.f1_score(y, yhat)
    auc = metrics.roc_auc_score(y, yhat)

    
    result = {
        'model': 'tripartite',
        'time': None,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'graph_based': True,
        'method': 'WeightedL1Embedder',
        'throw_rate': df.is_fraud.mean(),
        'train_size': len(train_labels),
        'train_cols': 'amt',
        'train_frate': np.array(train_labels).mean(),
        'test_size': len(y),
        'test_frate': np.array(y).mean(),
        'hyper_params': None,
        'theta': None,
        'gamma': None
    }

    
    df_results = pd.concat([df_results, pd.DataFrame([result])], ignore_index=True)
    
    return df_results
def try_book_W2(fraudTrain, fraudrate, n, prev_results=None):
    if prev_results is None:
        df_results = pd.DataFrame(columns=[
            'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based', 
            'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate', 
            'test_size', 'test_frate', 'hyper_params'
        ])
    else:
        df_results = prev_results
    
    dfrate = throw(fraudTrain, fraudrate)
    df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)
        
    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)]
    dfnn = dfnn.reset_index(drop=True)
    df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)
   
    df2, mask = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()

    G_df = build_graph_tripartite(df, nx.Graph())

    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))), 
                                                                 list(nx.get_edge_attributes(G_df, "label").values()), 
                                                                 test_size=0.20, 
                                                                 random_state=42)

    edgs = list(G_df.edges)
    train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    
    embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    yhat = rf.predict(test_embeddings)
    acc = metrics.accuracy_score(y, yhat)
    pre = metrics.precision_score(y, yhat)
    rec = metrics.recall_score(y, yhat)
    f1 = metrics.f1_score(y, yhat)
    auc = metrics.roc_auc_score(y, yhat)

    
    result = {
        'model': 'tripartite',
        'time': None,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'graph_based': True,
        'method': 'WeightedL2Embedder',
        'throw_rate': df.is_fraud.mean(),
        'train_size': len(train_labels),
        'train_cols': 'amt',
        'train_frate': np.array(train_labels).mean(),
        'test_size': len(y),
        'test_frate': np.array(y).mean(),
        'hyper_params': None,
        'theta': None,
        'gamma': None
    }

    
    df_results = pd.concat([df_results, pd.DataFrame([result])], ignore_index=True)
    
    return df_results
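Apart from the embedder class and the method string it records, try_book, try_book_A, try_book_W1, and try_book_W2 are line-for-line identical, so the duplication could be folded into one parameterized helper. A hypothetical sketch (try_book_embedder is not from the book; it reuses throw and concat from the %run files above):

def try_book_embedder(fraudTrain, fraudrate, n, embedder_cls, prev_results=None):
    # Hypothetical refactor: run the shared pipeline once with the edge embedder injected.
    df_results = prev_results if prev_results is not None else pd.DataFrame()
    dfrate = throw(fraudTrain, fraudrate)
    df_tr, _ = sklearn.model_selection.train_test_split(dfrate)
    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)].reset_index(drop=True)
    _, df_tstn = sklearn.model_selection.train_test_split(dfnn)
    df2, _ = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()
    G_df = build_graph_tripartite(df, nx.Graph())
    train_edges, test_edges, train_labels, y = train_test_split(
        list(range(len(G_df.edges))),
        list(nx.get_edge_attributes(G_df, "label").values()),
        test_size=0.20, random_state=42)
    edgs = list(G_df.edges)
    train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))
    model_train = Node2Vec(train_graph, weight_key='weight').fit(window=10)
    emb = embedder_cls(keyed_vectors=model_train.wv)
    X_train = [emb[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    X_test = [emb[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
    rf = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train, train_labels)
    yhat = rf.predict(X_test)
    result = {
        'model': 'tripartite', 'method': embedder_cls.__name__, 'graph_based': True,
        'acc': metrics.accuracy_score(y, yhat), 'pre': metrics.precision_score(y, yhat),
        'rec': metrics.recall_score(y, yhat), 'f1': metrics.f1_score(y, yhat),
        'auc': metrics.roc_auc_score(y, yhat),
        'throw_rate': df.is_fraud.mean(), 'train_size': len(train_labels),
        'train_cols': 'amt', 'train_frate': np.array(train_labels).mean(),
        'test_size': len(y), 'test_frate': np.array(y).mean(),
    }
    return pd.concat([df_results, pd.DataFrame([result])], ignore_index=True)

# Usage sketch:
# results = try_book_embedder(fraudTrain, 0.5, 10, HadamardEmbedder)
# results = try_book_embedder(fraudTrain, 0.5, 10, AverageEmbedder, prev_results=results)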